UCI SMS Spam Collection Dataset

  • Input: SMS text content. Target: ham or spam
  • Data representation: each SMS is represented as a fixed-length vector of word indices. A word-index lookup is generated from the vocabulary list (see the sketch after this list).
  • Word embeddings: a dense embedding vector is learnt for each word, so each SMS becomes a matrix of shape (document-word-count, word-embedding-size)
  • RNN: the word embeddings are processed as a sequence by an LSTM RNN
  • train-data.tsv, valid-data.tsv, and vocab_list.tsv are prepared and saved in 'data/sms-spam'
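
A minimal sketch of this representation in plain Python (illustrative only: the vocabulary below is hypothetical, and in the notebook the lookup, padding, and truncation are done inside the TensorFlow graph):

# toy vocabulary: word => word index
vocab = {'win': 0, 'cash': 1, 'free': 2, 'call': 3}
PAD_ID = len(vocab)      # index used for padding
MAX_LEN = 6              # fixed document length

sms = 'win free cash'
word_ids = [vocab.get(w, PAD_ID) for w in sms.split()]
word_ids = (word_ids + [PAD_ID] * MAX_LEN)[:MAX_LEN]
print(word_ids)          # [0, 2, 1, 4, 4, 4]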

In [1]:
import tensorflow as tf
from tensorflow import data
from datetime import datetime
import multiprocessing
import shutil

print(tf.__version__)


/Users/khalidsalama/anaconda/lib/python3.6/importlib/_bootstrap.py:205: RuntimeWarning: compiletime version 3.5 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.6
  return f(*args, **kwds)
1.4.1

In [2]:
MODEL_NAME = 'sms-class-model-01'

TRAIN_DATA_FILES_PATTERN = 'data/sms-spam/train-*.tsv'
VALID_DATA_FILES_PATTERN = 'data/sms-spam/valid-*.tsv'

VOCAB_LIST_FILE = 'data/sms-spam/vocab_list.tsv'
N_WORDS_FILE = 'data/sms-spam/n_words.tsv'

RESUME_TRAINING = False
MULTI_THREADING = True

1. Define Dataset Metadata


In [3]:
MAX_DOCUMENT_LENGTH = 50

PAD_WORD = '#=KS=#'

HEADER = ['class', 'sms']
HEADER_DEFAULTS = [['NA'], ['NA']]

TEXT_FEATURE_NAME = 'sms'

TARGET_NAME = 'class'

WEIGHT_COLUNM_NAME = 'weight'

TARGET_LABELS = ['spam', 'ham']

with open(N_WORDS_FILE) as file:
    N_WORDS = int(file.read())+2
print(N_WORDS)


11332

2. Define Data Input Function

a. TSV parsing logic


In [4]:
def parse_tsv_row(tsv_row):
    
    columns = tf.decode_csv(tsv_row, record_defaults=HEADER_DEFAULTS, field_delim='\t')
    features = dict(zip(HEADER, columns))
    
    target = features.pop(TARGET_NAME)
    
    # give more weight to "spam" records, as they make up only ~13% of the training set
    features[WEIGHT_COLUNM_NAME] =  tf.cond( tf.equal(target,'spam'), lambda: 6.6, lambda: 1.0 ) 

    return features, target
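
The weight of 6.6 is roughly the inverse of the spam ratio. A minimal sketch of how such a weight could be derived, assuming the stated ~13% spam share:

# hypothetical derivation of the class weight, assuming ~13% spam
spam_ratio = 0.13
spam_weight = (1.0 - spam_ratio) / spam_ratio  # ~6.7, close to the 6.6 used above
print(round(spam_weight, 1))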

b. Data pipeline input function


In [5]:
def parse_label_column(label_string_tensor):
    table = tf.contrib.lookup.index_table_from_tensor(tf.constant(TARGET_LABELS))
    return table.lookup(label_string_tensor)

def input_fn(files_name_pattern, mode=tf.estimator.ModeKeys.EVAL, 
                 skip_header_lines=0, 
                 num_epochs=1, 
                 batch_size=200):
    
    shuffle = (mode == tf.estimator.ModeKeys.TRAIN)
    
    num_threads = multiprocessing.cpu_count() if MULTI_THREADING else 1
    
    buffer_size = 2 * batch_size + 1
   
    print("")
    print("* data input_fn:")
    print("================")
    print("Input file(s): {}".format(files_name_pattern))
    print("Batch size: {}".format(batch_size))
    print("Epoch Count: {}".format(num_epochs))
    print("Mode: {}".format(mode))
    print("Thread Count: {}".format(num_threads))
    print("Shuffle: {}".format(shuffle))
    print("================")
    print("")

    file_names = tf.matching_files(files_name_pattern)
    dataset = data.TextLineDataset(filenames=file_names)
    
    dataset = dataset.skip(skip_header_lines)
    
    if shuffle:
        dataset = dataset.shuffle(buffer_size)
        
    dataset = dataset.map(lambda tsv_row: parse_tsv_row(tsv_row), 
                          num_parallel_calls=num_threads)
    
    dataset = dataset.batch(batch_size)
    dataset = dataset.repeat(num_epochs)
    dataset = dataset.prefetch(buffer_size)
    
    iterator = dataset.make_one_shot_iterator()
    
    features, target = iterator.get_next()
    return features, parse_label_column(target)
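
A minimal sketch (assuming the TSV files above are in place) of pulling a single batch from this input function in a TF 1.x session, e.g. to sanity-check the parsed features:

# illustrative only: fetch one small batch and inspect it
features_op, target_op = input_fn(TRAIN_DATA_FILES_PATTERN,
                                  mode=tf.estimator.ModeKeys.EVAL,
                                  batch_size=5)
with tf.Session() as sess:
    sess.run(tf.tables_initializer())    # initialise the label lookup table
    features, target = sess.run([features_op, target_op])
    print(features[TEXT_FEATURE_NAME])   # raw SMS strings
    print(features[WEIGHT_COLUNM_NAME])  # per-row weights
    print(target)                        # integer label ids (0=spam, 1=ham)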

3. Define Model Function


In [6]:
def process_text(text_feature):
    
    # Load vocabulary lookup table to map word => word_id
    vocab_table = tf.contrib.lookup.index_table_from_file(vocabulary_file=VOCAB_LIST_FILE, 
                                                          num_oov_buckets=1, default_value=-1)
    # Get text feature
    smss = text_feature
    # Split text into words -> this produces a sparse tensor with variable-length entries (one per word)
    words = tf.string_split(smss)
    # Convert sparse tensor to dense tensor by padding each entry to match the longest in the batch
    dense_words = tf.sparse_tensor_to_dense(words, default_value=PAD_WORD)
    # Convert word to word_ids via the vocab lookup table
    word_ids = vocab_table.lookup(dense_words)
    # Create padding of up to MAX_DOCUMENT_LENGTH zeros at the end of each row
    padding = tf.constant([[0,0],[0,MAX_DOCUMENT_LENGTH]])
    # Pad all the word_id entries, then slice back to exactly MAX_DOCUMENT_LENGTH,
    # so shorter documents are padded and longer ones are truncated
    word_ids_padded = tf.pad(word_ids, padding)
    word_id_vector = tf.slice(word_ids_padded, [0,0], [-1, MAX_DOCUMENT_LENGTH])
    
    # Return the final word_id_vector
    return word_id_vector


def model_fn(features, labels, mode, params):
    
    hidden_units = params.hidden_units
    output_layer_size = len(TARGET_LABELS)
    embedding_size = params.embedding_size
    forget_bias = params.forget_bias
    keep_prob = params.keep_prob
    
    # word_id_vector
    word_id_vector = process_text(features[TEXT_FEATURE_NAME]) 
    # print("word_id_vector: {}".format(word_id_vector)) # (?, MAX_DOCUMENT_LENGTH)
    
    # layer to take each word_id and convert it into vector (embeddings) 
    word_embeddings = tf.contrib.layers.embed_sequence(word_id_vector, vocab_size=N_WORDS, 
                                                 embed_dim=embedding_size) 
    #print("word_embeddings: {}".format(word_embeddings)) # (?, MAX_DOCUMENT_LENGTH, embbeding_size)
    
    # configure the RNN
    rnn_layers = [tf.nn.rnn_cell.LSTMCell(
        num_units=size, 
        forget_bias=forget_bias,
        activation=tf.nn.tanh) for size in hidden_units]

    # create an RNN cell composed sequentially of the LSTM cells above
    multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers)
    
    input_layer = tf.unstack(word_embeddings, axis=1)
    # list of len(MAX_DOCUMENT_LENGTH), each element is (?, embedding_size)
    #print("input_layer: {}".format(input_layer)) 
    
    outputs, _ = tf.nn.static_rnn(cell=multi_rnn_cell, 
                                inputs=input_layer, 
                                dtype=tf.float32)
    
    # keep only the output of the last time step of the RNN
    rnn_output = outputs[-1]

    # Connect the last RNN output to the output layer (logits, no activation fn)
    logits = tf.layers.dense(inputs=rnn_output, 
                             units=output_layer_size, 
                             activation=None)
    # print("logits: {}".format(logits)) # (?, output_layer_size)

    # Provide an estimator spec for `ModeKeys.PREDICT`.
    if mode == tf.estimator.ModeKeys.PREDICT:
        probabilities = tf.nn.softmax(logits)
        predicted_indices = tf.argmax(probabilities, 1)

        # Convert predicted_indices back into strings
        predictions = {
            'class': tf.gather(TARGET_LABELS, predicted_indices),
            'probabilities': probabilities
        }
        export_outputs = {
            'prediction': tf.estimator.export.PredictOutput(predictions)
        }
        
        # Provide an estimator spec for `ModeKeys.PREDICT` modes.
        return tf.estimator.EstimatorSpec(mode,
                                          predictions=predictions,
                                          export_outputs=export_outputs)
    
    # weights
    weights = features[WEIGHT_COLUNM_NAME]

    # Calculate loss using softmax cross entropy
    loss = tf.losses.sparse_softmax_cross_entropy(
        logits=logits, labels=labels, 
        weights=weights
    )
    
    tf.summary.scalar('loss', loss)
    
    if mode == tf.estimator.ModeKeys.TRAIN:
        # Create Optimiser
        optimizer = tf.train.AdamOptimizer(params.learning_rate)

        # Create training operation
        train_op = optimizer.minimize(
            loss=loss, global_step=tf.train.get_global_step())

        # Provide an estimator spec for `ModeKeys.TRAIN` modes.
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss, 
                                          train_op=train_op)
        

    if mode == tf.estimator.ModeKeys.EVAL:
        probabilities = tf.nn.softmax(logits)
        predicted_indices = tf.argmax(probabilities, 1)

        # Return accuracy and area under ROC curve metrics
        labels_one_hot = tf.one_hot(
            labels,
            depth=len(TARGET_LABELS),
            on_value=True,
            off_value=False,
            dtype=tf.bool
        )
        
        eval_metric_ops = {
            'accuracy': tf.metrics.accuracy(labels, predicted_indices, weights=weights),
            'auroc': tf.metrics.auc(labels_one_hot, probabilities, weights=weights)
        }
        
        # Provide an estimator spec for `ModeKeys.EVAL` modes.
        return tf.estimator.EstimatorSpec(mode, 
                                          loss=loss, 
                                          eval_metric_ops=eval_metric_ops)

def create_estimator(run_config, hparams):
    estimator = tf.estimator.Estimator(model_fn=model_fn, 
                                  params=hparams, 
                                  config=run_config)
    
    print("")
    print("Estimator Type: {}".format(type(estimator)))
    print("")

    return estimator
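
Note that keep_prob is read from params inside model_fn but not actually used in the model above. A minimal sketch of how, within model_fn, the LSTM cells could be wrapped with dropout (a hypothetical variant, not part of the model as written, and typically applied only in TRAIN mode):

    rnn_layers = [
        tf.nn.rnn_cell.DropoutWrapper(
            tf.nn.rnn_cell.LSTMCell(num_units=size,
                                    forget_bias=forget_bias,
                                    activation=tf.nn.tanh),
            output_keep_prob=keep_prob)  # dropout on each cell's outputs
        for size in hidden_units]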

4. Run Experiment

a. Set HParams and RunConfig


In [7]:
TRAIN_SIZE = 4179
NUM_EPOCHS = 100
BATCH_SIZE = 250
EVAL_AFTER_SEC = 60
TOTAL_STEPS = int((TRAIN_SIZE/BATCH_SIZE)*NUM_EPOCHS)

hparams  = tf.contrib.training.HParams(
    num_epochs = NUM_EPOCHS,
    batch_size = BATCH_SIZE,
    embedding_size = 5,
    forget_bias=1.0,
    keep_prob = 0.8,
    hidden_units=[24, 16],
    max_steps = TOTAL_STEPS,
    learning_rate = 0.01
)

model_dir = 'trained_models/{}'.format(MODEL_NAME)

run_config = tf.estimator.RunConfig(
    log_step_count_steps=5000,
    tf_random_seed=19830610,
    model_dir=model_dir
)

print(hparams)
print("Model Directory:", run_config.model_dir)
print("")
print("Dataset Size:", TRAIN_SIZE)
print("Batch Size:", BATCH_SIZE)
print("Steps per Epoch:",TRAIN_SIZE/BATCH_SIZE)
print("Total Steps:", TOTAL_STEPS)
print("That is 1 evaluation step after each",EVAL_AFTER_SEC,"training seconds")


[('batch_size', 250), ('embedding_size', 5), ('forget_bias', 1.0), ('hidden_units', [24, 16]), ('keep_prob', 0.8), ('learning_rate', 0.01), ('max_steps', 1671), ('num_epochs', 100)]
Model Directory: trained_models/sms-class-model-01

Dataset Size: 4179
Batch Size: 250
Steps per Epoch: 16.716
Total Steps: 1671
That is, one evaluation after every 60 seconds of training

b. Define serving function


In [8]:
def serving_input_fn():
    
    receiver_tensor = {
      'sms': tf.placeholder(tf.string, [None]),
    }
    
    features = {
      key: tensor
      for key, tensor in receiver_tensor.items()
    }
    
    return tf.estimator.export.ServingInputReceiver(
        features, receiver_tensor)
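
Besides the LatestExporter used in the EvalSpec below, the same serving function can be used to export a trained model manually. A minimal sketch, assuming the estimator has already been created and trained (the 'export/manual' sub-directory is just an example path):

export_path = estimator.export_savedmodel(
    export_dir_base=model_dir + '/export/manual',
    serving_input_receiver_fn=serving_input_fn)
print(export_path)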

c. Define TrainSpec and EvalSpec


In [9]:
train_spec = tf.estimator.TrainSpec(
    input_fn = lambda: input_fn(
        TRAIN_DATA_FILES_PATTERN,
        mode = tf.estimator.ModeKeys.TRAIN,
        num_epochs=hparams.num_epochs,
        batch_size=hparams.batch_size
    ),
    max_steps=hparams.max_steps,
    hooks=None
)

eval_spec = tf.estimator.EvalSpec(
    input_fn = lambda: input_fn(
        VALID_DATA_FILES_PATTERN,
        mode=tf.estimator.ModeKeys.EVAL,
        batch_size=hparams.batch_size
    ),
    exporters=[tf.estimator.LatestExporter(
        name="predict", # the name of the folder in which the model will be exported to under export
        serving_input_receiver_fn=serving_input_fn,
        exports_to_keep=1,
        as_text=True)],
    steps=None,
    throttle_secs = EVAL_AFTER_SEC
)

d. Run Experiment via train_and_evaluate


In [10]:
if not RESUME_TRAINING:
    print("Removing previous artifacts...")
    shutil.rmtree(model_dir, ignore_errors=True)
else:
    print("Resuming training...") 

    
tf.logging.set_verbosity(tf.logging.INFO)

time_start = datetime.utcnow() 
print("Experiment started at {}".format(time_start.strftime("%H:%M:%S")))
print(".......................................") 

estimator = create_estimator(run_config, hparams)

tf.estimator.train_and_evaluate(
    estimator=estimator,
    train_spec=train_spec, 
    eval_spec=eval_spec
)

time_end = datetime.utcnow() 
print(".......................................")
print("Experiment finished at {}".format(time_end.strftime("%H:%M:%S")))
print("")
time_elapsed = time_end - time_start
print("Experiment elapsed time: {} seconds".format(time_elapsed.total_seconds()))


Removing previous artifacts...
Experiment started at 17:55:29
.......................................
INFO:tensorflow:Using config: {'_model_dir': 'trained_models/sms-class-model-01', '_tf_random_seed': 19830610, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 5000, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x11a14aeb8>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}

Estimator Type: <class 'tensorflow.python.estimator.estimator.Estimator'>

INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after 60 secs (eval_spec.throttle_secs) or training is finished.

* data input_fn:
================
Input file(s): data/sms-spam/train-*.tsv
Batch size: 250
Epoch Count: 100
Mode: train
Thread Count: 4
Shuffle: True
================

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into trained_models/sms-class-model-01/model.ckpt.
INFO:tensorflow:loss = 1.11233, step = 1
INFO:tensorflow:loss = 0.704967, step = 101 (6.497 sec)
INFO:tensorflow:loss = 0.234601, step = 201 (3.310 sec)
INFO:tensorflow:loss = 0.06392, step = 301 (3.324 sec)
INFO:tensorflow:loss = 0.0623116, step = 401 (3.431 sec)
INFO:tensorflow:loss = 0.188396, step = 501 (3.429 sec)
INFO:tensorflow:loss = 0.0622047, step = 601 (3.240 sec)
INFO:tensorflow:loss = 0.107527, step = 701 (3.230 sec)
INFO:tensorflow:loss = 0.103855, step = 801 (3.270 sec)
INFO:tensorflow:Saving checkpoints for 813 into trained_models/sms-class-model-01/model.ckpt.
INFO:tensorflow:Loss for final step: 0.255177.

* data input_fn:
================
Input file(s): data/sms-spam/valid-*.tsv
Batch size: 250
Epoch Count: 1
Mode: eval
Thread Count: 4
Shuffle: False
================

INFO:tensorflow:Starting evaluation at 2017-12-26-17:57:00
INFO:tensorflow:Restoring parameters from trained_models/sms-class-model-01/model.ckpt-813
INFO:tensorflow:Finished evaluation at 2017-12-26-17:57:01
INFO:tensorflow:Saving dict for global step 813: accuracy = 0.955313, auroc = 0.963212, global_step = 813, loss = 0.339697
INFO:tensorflow:Restoring parameters from trained_models/sms-class-model-01/model.ckpt-813
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:Assets written to: b"trained_models/sms-class-model-01/export/predict/temp-b'1514311026'/assets"
INFO:tensorflow:SavedModel written to: b"trained_models/sms-class-model-01/export/predict/temp-b'1514311026'/saved_model.pbtxt"

* data input_fn:
================
Input file(s): data/sms-spam/train-*.tsv
Batch size: 250
Epoch Count: 100
Mode: train
Thread Count: 4
Shuffle: True
================

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from trained_models/sms-class-model-01/model.ckpt-813
INFO:tensorflow:Saving checkpoints for 814 into trained_models/sms-class-model-01/model.ckpt.
INFO:tensorflow:loss = 0.0556223, step = 814
INFO:tensorflow:loss = 0.290847, step = 914 (5.671 sec)
INFO:tensorflow:loss = 0.250877, step = 1014 (3.257 sec)
INFO:tensorflow:loss = 0.0895318, step = 1114 (3.244 sec)
INFO:tensorflow:loss = 0.255735, step = 1214 (3.235 sec)
INFO:tensorflow:loss = 0.158367, step = 1314 (3.238 sec)
INFO:tensorflow:loss = 0.0101549, step = 1414 (3.232 sec)
INFO:tensorflow:loss = 0.0105816, step = 1514 (3.255 sec)
INFO:tensorflow:loss = 0.0585981, step = 1614 (3.248 sec)
INFO:tensorflow:Saving checkpoints for 1671 into trained_models/sms-class-model-01/model.ckpt.
INFO:tensorflow:Loss for final step: 0.139656.

* data input_fn:
================
Input file(s): data/sms-spam/valid-*.tsv
Batch size: 250
Epoch Count: 1
Mode: eval
Thread Count: 4
Shuffle: False
================

INFO:tensorflow:Starting evaluation at 2017-12-26-17:58:23
INFO:tensorflow:Restoring parameters from trained_models/sms-class-model-01/model.ckpt-1671
INFO:tensorflow:Finished evaluation at 2017-12-26-17:58:25
INFO:tensorflow:Saving dict for global step 1671: accuracy = 0.955723, auroc = 0.971936, global_step = 1671, loss = 0.369961
INFO:tensorflow:Restoring parameters from trained_models/sms-class-model-01/model.ckpt-1671
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:Assets written to: b"trained_models/sms-class-model-01/export/predict/temp-b'1514311107'/assets"
INFO:tensorflow:SavedModel written to: b"trained_models/sms-class-model-01/export/predict/temp-b'1514311107'/saved_model.pbtxt"
.......................................
Experiment finished at 17:58:29

Experiment elapsed time: 179.53845 seconds

5. Evaluate the Model


In [11]:
TRAIN_SIZE = 4179
TEST_SIZE = 1393

train_input_fn = lambda: input_fn(files_name_pattern= TRAIN_DATA_FILES_PATTERN, 
                                      mode= tf.estimator.ModeKeys.EVAL,
                                      batch_size= TRAIN_SIZE)

test_input_fn = lambda: input_fn(files_name_pattern= VALID_DATA_FILES_PATTERN, 
                                      mode= tf.estimator.ModeKeys.EVAL,
                                      batch_size= TEST_SIZE)

estimator = create_estimator(run_config, hparams)

train_results = estimator.evaluate(input_fn=train_input_fn, steps=1)
print()
print("######################################################################################")
print("# Train Measures: {}".format(train_results))
print("######################################################################################")

test_results = estimator.evaluate(input_fn=test_input_fn, steps=1)
print()
print("######################################################################################")
print("# Test Measures: {}".format(test_results))
print("######################################################################################")


INFO:tensorflow:Using config: {'_model_dir': 'trained_models/sms-class-model-01', '_tf_random_seed': 19830610, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 5000, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x11a14aeb8>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}

Estimator Type: <class 'tensorflow.python.estimator.estimator.Estimator'>


* data input_fn:
================
Input file(s): data/sms-spam/train-*.tsv
Batch size: 4179
Epoch Count: 1
Mode: eval
Thread Count: 4
Shuffle: False
================

INFO:tensorflow:Starting evaluation at 2017-12-26-17:58:31
INFO:tensorflow:Restoring parameters from trained_models/sms-class-model-01/model.ckpt-1671
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2017-12-26-17:58:33
INFO:tensorflow:Saving dict for global step 1671: accuracy = 0.994477, auroc = 0.995733, global_step = 1671, loss = 0.0582446

######################################################################################
# Train Measures: {'accuracy': 0.99447709, 'auroc': 0.99573296, 'loss': 0.058244605, 'global_step': 1671}
######################################################################################

* data input_fn:
================
Input file(s): data/sms-spam/valid-*.tsv
Batch size: 1393
Epoch Count: 1
Mode: eval
Thread Count: 4
Shuffle: False
================

INFO:tensorflow:Starting evaluation at 2017-12-26-17:58:35
INFO:tensorflow:Restoring parameters from trained_models/sms-class-model-01/model.ckpt-1671
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2017-12-26-17:58:37
INFO:tensorflow:Saving dict for global step 1671: accuracy = 0.955723, auroc = 0.971936, global_step = 1671, loss = 0.360811

######################################################################################
# Test Measures: {'accuracy': 0.95572317, 'auroc': 0.97193635, 'loss': 0.36081061, 'global_step': 1671}
######################################################################################

6. Predict Using Serving Function


In [12]:
import os

export_dir = model_dir +"/export/predict/"

saved_model_dir = export_dir + "/" + os.listdir(path=export_dir)[-1] 

print(saved_model_dir)
print("")

predictor_fn = tf.contrib.predictor.from_saved_model(
    export_dir = saved_model_dir,
    signature_def_key="prediction"
)

output = predictor_fn(
    {
        'sms':[
            'ok, I will be with you in 5 min. see you then',
            'win 1000 cash free of charge promo hot deal sexy',
            'hot girls sexy tonight call girls waiting for chat'
        ]
        
    }
)
print(output)


trained_models/sms-class-model-01/export/predict//1514311107

INFO:tensorflow:Restoring parameters from b'trained_models/sms-class-model-01/export/predict//1514311107/variables/variables'
{'class': array([b'ham', b'spam', b'ham'], dtype=object), 'probabilities': array([[ 0.0076826 ,  0.99231744],
       [ 0.9970181 ,  0.00298198],
       [ 0.08724735,  0.91275269]], dtype=float32)}
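
The probability columns follow the order of TARGET_LABELS (spam, ham). A small sketch to pair each predicted class with its labelled probabilities:

for cls, probs in zip(output['class'], output['probabilities']):
    print(cls, dict(zip(TARGET_LABELS, probs)))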

In [ ]: